data <- read.csv('GSE37704.csv')
l.data <- log(data[,-1] + 1)
boxplot(l.data, col = c(rep(2,3),rep(3,3)), main="Boxplot of log of dataset")
lowCount <- NULL
constantRead <- NULL
result <- NULL
Sign <- NULL
MW_U <- NULL
for(i in 1:nrow(l.data)){
ctrl <- as.numeric( l.data[i, 1:3] )
exp <- as.numeric( l.data[i, 4:6] )
if( var(ctrl) == 0 ||
var(exp) == 0){
#variance of both group are 0, cannot use T test
constantRead <- c(constantRead, i)
result<-c(result, NA)
MW_U[i] <- NA
Sign[i] <- NA
next
}
S <- wilcox.test(ctrl, exp)
MW <- wilcox.test(ctrl, exp,correct = F)
Sign[i] <- S$p.value
MW_U[i] <- MW$p.value
}
a <- which.min(Sign)
b <- which.min(MW_U)
l.data[a,]
## control_1 control_2 control_3 Hoxa1KN_1 Hoxa1KN_2 Hoxa1KN_3
## 104 2.772589 2.944439 2.772589 1.098612 1.098612 1.386294
l.data[b,]
## control_1 control_2 control_3 Hoxa1KN_1 Hoxa1KN_2 Hoxa1KN_3
## 104 2.772589 2.944439 2.772589 1.098612 1.098612 1.386294
rowMean <- rowMeans(data[2:7])
summary(rowMean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 2.2 120.7 910.8 660.8 417990.0
plot_ly(x=rowMean, type='histogram')
By playing with this graph, we can see a lot ‘low count’ genes. They need to be filtered out.
df <- data[rowMean > 1200,]
plot_ly(x=rowMeans(df[,-1]), type='histogram')
PCA
rownames(df) <- df$ensgene
df <- df[,-1]
pcaResult <- princomp(df)
plot(pcaResult, type = 'l')
pcaHC <- hclust(dist(pcaResult$scores), method='ward.D2')
plot(pcaHC)
geneCluster <- cutree(pcaHC, k=3)
geneDf.Temp <- data.frame(pcaResult$scores, cluster = factor(geneCluster))
geneDf <- transform(geneDf.Temp, cluster_name = paste('Cluster', geneCluster) )
#
# plot_ly(geneDf,
# x = geneDf$Comp.1,
# y = geneDf$Comp.2,
# mode = "markers",
# text = rownames(df),
# color = geneDf$cluster_name, marker = list(size = 2))
#
# p <- layout(p, title = "PCA Result",
# xaxis = list(title = "PC 1"),
# yaxis = list(title = "PC 2"))
#
# p